# Import all the libraries
import numpy as np # linear algebra
import pandas as pd # data processing, data manupilation
# visualization
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
%matplotlib inline
import plotly.offline as py
import plotly.graph_objs as go
import plotly.subplots as sp
import plotly.figure_factory as ff
#remove warnings
import warnings
warnings.filterwarnings("ignore")
data = pd.read_csv("bank-customer-churn.csv", index_col=0)
data
| Row Number | Customer Id | Surname | Credit Score | Geography | Gender | Age | Tenure | Balance | Num Of Products | Has CrCard | Is Active Member | Estimated Salary | Exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 15634602 | Hargrave | 619 | France | Female | 42 | 2 | 0.00 | 1 | Yes | Yes | 101348.88 | Yes |
| 1 | 2 | 15647311 | Hill | 608 | Spain | Female | 41 | 1 | 83807.86 | 1 | No | Yes | 112542.58 | No |
| 2 | 3 | 15619304 | Onio | 502 | France | Female | 42 | 8 | 159660.80 | 3 | Yes | No | 113931.57 | Yes |
| 3 | 4 | 15701354 | Boni | 699 | France | Female | 39 | 1 | 0.00 | 2 | No | No | 93826.63 | No |
| 4 | 5 | 15737888 | Mitchell | 850 | Spain | Female | 43 | 2 | 125510.82 | 1 | Yes | Yes | 79084.1 | No |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9995 | 9996 | 15606229 | Obijiaku | 771 | France | Male | 39 | 5 | 0.00 | 2 | Yes | No | 96270.64 | No |
| 9996 | 9997 | 15569892 | Johnstone | 516 | France | Male | 35 | 10 | 57369.61 | 1 | Yes | Yes | 101699.77 | No |
| 9997 | 9998 | 15584532 | Liu | 709 | France | Female | 36 | 7 | 0.00 | 1 | No | Yes | 42085.58 | Yes |
| 9998 | 9999 | 15682355 | Sabbatini | 772 | Germany | Male | 42 | 3 | 75075.31 | 2 | Yes | No | 92888.52 | Yes |
| 9999 | 10000 | 15628319 | Walker | 792 | France | Female | 28 | 4 | 130142.79 | 1 | Yes | No | 38190.78 | No |
10000 rows × 14 columns
data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 10000 entries, 0 to 9999 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Row Number 10000 non-null int64 1 Customer Id 10000 non-null int64 2 Surname 10000 non-null object 3 Credit Score 10000 non-null object 4 Geography 10000 non-null object 5 Gender 10000 non-null object 6 Age 10000 non-null int64 7 Tenure 10000 non-null int64 8 Balance 10000 non-null float64 9 Num Of Products 10000 non-null int64 10 Has CrCard 10000 non-null object 11 Is Active Member 10000 non-null object 12 Estimated Salary 10000 non-null object 13 Exited 10000 non-null object dtypes: float64(1), int64(5), object(8) memory usage: 1.1+ MB
data.describe()
| Row Number | Customer Id | Age | Tenure | Balance | Num Of Products | |
|---|---|---|---|---|---|---|
| count | 10000.00000 | 1.000000e+04 | 10000.000000 | 10000.000000 | 10000.000000 | 10000.000000 |
| mean | 5000.50000 | 1.569094e+07 | 38.921800 | 5.012800 | 76485.889288 | 1.530200 |
| std | 2886.89568 | 7.193619e+04 | 10.487806 | 2.892174 | 62397.405202 | 0.581654 |
| min | 1.00000 | 1.556570e+07 | 18.000000 | 0.000000 | 0.000000 | 1.000000 |
| 25% | 2500.75000 | 1.562853e+07 | 32.000000 | 3.000000 | 0.000000 | 1.000000 |
| 50% | 5000.50000 | 1.569074e+07 | 37.000000 | 5.000000 | 97198.540000 | 1.000000 |
| 75% | 7500.25000 | 1.575323e+07 | 44.000000 | 7.000000 | 127644.240000 | 2.000000 |
| max | 10000.00000 | 1.581569e+07 | 92.000000 | 10.000000 | 250898.090000 | 4.000000 |
print ("Rows : " ,data.shape[0])
print ("Columns : " ,data.shape[1])
print ("\nFeatures : \n" ,data.columns.tolist())
print ("\nUnique values : \n",data.nunique())
Rows : 10000 Columns : 14 Features : ['Row Number', 'Customer Id', 'Surname', 'Credit Score', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'Num Of Products', 'Has CrCard', 'Is Active Member', 'Estimated Salary', 'Exited'] Unique values : Row Number 10000 Customer Id 10000 Surname 2932 Credit Score 461 Geography 3 Gender 2 Age 70 Tenure 11 Balance 6382 Num Of Products 4 Has CrCard 2 Is Active Member 2 Estimated Salary 9993 Exited 2 dtype: int64
data.isnull().sum()
Row Number 0 Customer Id 0 Surname 0 Credit Score 0 Geography 0 Gender 0 Age 0 Tenure 0 Balance 0 Num Of Products 0 Has CrCard 0 Is Active Member 0 Estimated Salary 0 Exited 0 dtype: int64
headers = ['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography','Gender','Age','Tenure',
'Balance','NumOfProducts','HasCrCard','IsActiveMember','EstimatedSalary', 'Churn']
df1= data.copy()
df1.columns = headers
df1.head()
| RowNumber | CustomerId | Surname | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 15634602 | Hargrave | 619 | France | Female | 42 | 2 | 0.00 | 1 | Yes | Yes | 101348.88 | Yes |
| 1 | 2 | 15647311 | Hill | 608 | Spain | Female | 41 | 1 | 83807.86 | 1 | No | Yes | 112542.58 | No |
| 2 | 3 | 15619304 | Onio | 502 | France | Female | 42 | 8 | 159660.80 | 3 | Yes | No | 113931.57 | Yes |
| 3 | 4 | 15701354 | Boni | 699 | France | Female | 39 | 1 | 0.00 | 2 | No | No | 93826.63 | No |
| 4 | 5 | 15737888 | Mitchell | 850 | Spain | Female | 43 | 2 | 125510.82 | 1 | Yes | Yes | 79084.1 | No |
data.head()
| Row Number | Customer Id | Surname | Credit Score | Geography | Gender | Age | Tenure | Balance | Num Of Products | Has CrCard | Is Active Member | Estimated Salary | Exited | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 15634602 | Hargrave | 619 | France | Female | 42 | 2 | 0.00 | 1 | Yes | Yes | 101348.88 | Yes |
| 1 | 2 | 15647311 | Hill | 608 | Spain | Female | 41 | 1 | 83807.86 | 1 | No | Yes | 112542.58 | No |
| 2 | 3 | 15619304 | Onio | 502 | France | Female | 42 | 8 | 159660.80 | 3 | Yes | No | 113931.57 | Yes |
| 3 | 4 | 15701354 | Boni | 699 | France | Female | 39 | 1 | 0.00 | 2 | No | No | 93826.63 | No |
| 4 | 5 | 15737888 | Mitchell | 850 | Spain | Female | 43 | 2 | 125510.82 | 1 | Yes | Yes | 79084.1 | No |
missing_data = df1.isnull()
# Count missing values
for column in missing_data.columns.values.tolist():
print(column)
print (missing_data[column].value_counts())
print("")
RowNumber False 10000 Name: RowNumber, dtype: int64 CustomerId False 10000 Name: CustomerId, dtype: int64 Surname False 10000 Name: Surname, dtype: int64 CreditScore False 10000 Name: CreditScore, dtype: int64 Geography False 10000 Name: Geography, dtype: int64 Gender False 10000 Name: Gender, dtype: int64 Age False 10000 Name: Age, dtype: int64 Tenure False 10000 Name: Tenure, dtype: int64 Balance False 10000 Name: Balance, dtype: int64 NumOfProducts False 10000 Name: NumOfProducts, dtype: int64 HasCrCard False 10000 Name: HasCrCard, dtype: int64 IsActiveMember False 10000 Name: IsActiveMember, dtype: int64 EstimatedSalary False 10000 Name: EstimatedSalary, dtype: int64 Churn False 10000 Name: Churn, dtype: int64
#Replacing spaces with null values
df1 = df1.replace(" ",np.nan)
missing_data = df1.isnull()
#use code for count missing values
#Dropping null values from credit score column which contain 0.04% missing data
df1 = df1[df1["CreditScore"].notnull()]
df1 = df1.reset_index()[df1.columns]
#convert to float type
df1["CreditScore"] = df1["CreditScore"].astype(float)
df1.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9996 entries, 0 to 9995 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 RowNumber 9996 non-null int64 1 CustomerId 9996 non-null int64 2 Surname 9996 non-null object 3 CreditScore 9996 non-null float64 4 Geography 9996 non-null object 5 Gender 9996 non-null object 6 Age 9996 non-null int64 7 Tenure 9996 non-null int64 8 Balance 9996 non-null float64 9 NumOfProducts 9996 non-null int64 10 HasCrCard 9996 non-null object 11 IsActiveMember 9996 non-null object 12 EstimatedSalary 9991 non-null object 13 Churn 9996 non-null object dtypes: float64(2), int64(5), object(7) memory usage: 1.1+ MB
# Write code for Dropping null values from Estimated Salary column which contain 0.04% missing data
#convert to float type
df1["EstimatedSalary"] = df1["EstimatedSalary"].astype(float)
df1.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9996 entries, 0 to 9995 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 RowNumber 9996 non-null int64 1 CustomerId 9996 non-null int64 2 Surname 9996 non-null object 3 CreditScore 9996 non-null float64 4 Geography 9996 non-null object 5 Gender 9996 non-null object 6 Age 9996 non-null int64 7 Tenure 9996 non-null int64 8 Balance 9996 non-null float64 9 NumOfProducts 9996 non-null int64 10 HasCrCard 9996 non-null object 11 IsActiveMember 9996 non-null object 12 EstimatedSalary 9991 non-null float64 13 Churn 9996 non-null object dtypes: float64(3), int64(5), object(6) memory usage: 1.1+ MB
df1
| RowNumber | CustomerId | Surname | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 15634602 | Hargrave | 619.0 | France | Female | 42 | 2 | 0.00 | 1 | Yes | Yes | 101348.88 | Yes |
| 1 | 2 | 15647311 | Hill | 608.0 | Spain | Female | 41 | 1 | 83807.86 | 1 | No | Yes | 112542.58 | No |
| 2 | 3 | 15619304 | Onio | 502.0 | France | Female | 42 | 8 | 159660.80 | 3 | Yes | No | 113931.57 | Yes |
| 3 | 4 | 15701354 | Boni | 699.0 | France | Female | 39 | 1 | 0.00 | 2 | No | No | 93826.63 | No |
| 4 | 5 | 15737888 | Mitchell | 850.0 | Spain | Female | 43 | 2 | 125510.82 | 1 | Yes | Yes | 79084.10 | No |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9991 | 9996 | 15606229 | Obijiaku | 771.0 | France | Male | 39 | 5 | 0.00 | 2 | Yes | No | 96270.64 | No |
| 9992 | 9997 | 15569892 | Johnstone | 516.0 | France | Male | 35 | 10 | 57369.61 | 1 | Yes | Yes | 101699.77 | No |
| 9993 | 9998 | 15584532 | Liu | 709.0 | France | Female | 36 | 7 | 0.00 | 1 | No | Yes | 42085.58 | Yes |
| 9994 | 9999 | 15682355 | Sabbatini | 772.0 | Germany | Male | 42 | 3 | 75075.31 | 2 | Yes | No | 92888.52 | Yes |
| 9995 | 10000 | 15628319 | Walker | 792.0 | France | Female | 28 | 4 | 130142.79 | 1 | Yes | No | 38190.78 | No |
9996 rows × 14 columns
#get dummies for HasCrCard
dv1 = pd.get_dummies(df1["HasCrCard"])
dv1.head()
| No | Yes | |
|---|---|---|
| 0 | 0 | 1 |
| 1 | 1 | 0 |
| 2 | 0 | 1 |
| 3 | 1 | 0 |
| 4 | 0 | 1 |
#Rename columns
dv1.rename(columns={'Yes':'HasCrCard_Yes', 'No':'HasCrCard_No'}, inplace=True)
dv1.head()
| HasCrCard_No | HasCrCard_Yes | |
|---|---|---|
| 0 | 0 | 1 |
| 1 | 1 | 0 |
| 2 | 0 | 1 |
| 3 | 1 | 0 |
| 4 | 0 | 1 |
# merge data
df1 = pd.concat([df1, dv1], axis=1)
df1.head()
| RowNumber | CustomerId | Surname | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Churn | HasCrCard_No | HasCrCard_Yes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 15634602 | Hargrave | 619.0 | France | Female | 42 | 2 | 0.00 | 1 | Yes | Yes | 101348.88 | Yes | 0 | 1 |
| 1 | 2 | 15647311 | Hill | 608.0 | Spain | Female | 41 | 1 | 83807.86 | 1 | No | Yes | 112542.58 | No | 1 | 0 |
| 2 | 3 | 15619304 | Onio | 502.0 | France | Female | 42 | 8 | 159660.80 | 3 | Yes | No | 113931.57 | Yes | 0 | 1 |
| 3 | 4 | 15701354 | Boni | 699.0 | France | Female | 39 | 1 | 0.00 | 2 | No | No | 93826.63 | No | 1 | 0 |
| 4 | 5 | 15737888 | Mitchell | 850.0 | Spain | Female | 43 | 2 | 125510.82 | 1 | Yes | Yes | 79084.10 | No | 0 | 1 |
# drop original HasCrCard
df1.drop("HasCrCard", axis = 1, inplace=True)
df1.head()
| RowNumber | CustomerId | Surname | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | IsActiveMember | EstimatedSalary | Churn | HasCrCard_No | HasCrCard_Yes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 15634602 | Hargrave | 619.0 | France | Female | 42 | 2 | 0.00 | 1 | Yes | 101348.88 | Yes | 0 | 1 |
| 1 | 2 | 15647311 | Hill | 608.0 | Spain | Female | 41 | 1 | 83807.86 | 1 | Yes | 112542.58 | No | 1 | 0 |
| 2 | 3 | 15619304 | Onio | 502.0 | France | Female | 42 | 8 | 159660.80 | 3 | No | 113931.57 | Yes | 0 | 1 |
| 3 | 4 | 15701354 | Boni | 699.0 | France | Female | 39 | 1 | 0.00 | 2 | No | 93826.63 | No | 1 | 0 |
| 4 | 5 | 15737888 | Mitchell | 850.0 | Spain | Female | 43 | 2 | 125510.82 | 1 | Yes | 79084.10 | No | 0 | 1 |
# write code for get_dummies method for IsActiveMember
df1.Tenure.unique()
array([ 2, 1, 8, 7, 4, 6, 3, 10, 5, 9, 0], dtype=int64)
# Convert Tenure to categorical column
def tenure_lab(df1) :
if df1["Tenure"] <= 3 :
return "Tenure_0-3"
elif (df1["Tenure"] > 3) & (df1["Tenure"] <= 6 ):
return "Tenure_3-6"
elif (df1["Tenure"] > 6) & (df1["Tenure"] <= 10) :
return "Tenure_6-10"
df1["Tenure_group"] = df1.apply(lambda df1:tenure_lab(df1),
axis = 1)
df1.Tenure_group.unique()
array(['Tenure_0-3', 'Tenure_6-10', 'Tenure_3-6'], dtype=object)
df1.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9996 entries, 0 to 9995 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 RowNumber 9996 non-null int64 1 CustomerId 9996 non-null int64 2 Surname 9996 non-null object 3 CreditScore 9996 non-null float64 4 Geography 9996 non-null object 5 Gender 9996 non-null object 6 Age 9996 non-null int64 7 Tenure 9996 non-null int64 8 Balance 9996 non-null float64 9 NumOfProducts 9996 non-null int64 10 IsActiveMember 9996 non-null object 11 EstimatedSalary 9991 non-null float64 12 Churn 9996 non-null object 13 HasCrCard_No 9996 non-null uint8 14 HasCrCard_Yes 9996 non-null uint8 15 Tenure_group 9996 non-null object dtypes: float64(3), int64(5), object(6), uint8(2) memory usage: 1.1+ MB
#Separating churn and non churn customers
churn = df1[df1["Churn"] == 'Yes']
not_churn = df1[df1["Churn"] == 'No']
print ('Customers who stayed', len(not_churn))
print ('Customers who left', len(churn))
Customers who stayed 7961 Customers who left 2035
# rcParams for upcoming plots
rcParams['figure.figsize'] = 8,6
# Create a scatter plot
sns.scatterplot(data = df1, x =df1['Tenure'], y = df1['Balance'], hue=df1.Churn.tolist())
<AxesSubplot:xlabel='Tenure', ylabel='Balance'>
sns.boxplot(data= df1, x="Churn", y="Balance")
<AxesSubplot:xlabel='Churn', ylabel='Balance'>
#labels
lab = df1["Churn"].value_counts().keys().tolist()
#values
val = df1["Churn"].value_counts().values.tolist()
trace = go.Pie(labels = lab ,
values = val ,
marker = dict(colors = [ 'royalblue' ,'lime'],
line = dict(color = "white",
width = 1.3)
),
rotation = 90,
hoverinfo = "label+value+text",
hole = .5
)
layout = go.Layout(dict(title = "Churn vs. Non-Churn",
plot_bgcolor = "rgb(243,243,243)",
paper_bgcolor = "rgb(243,243,243)",
)
)
data = [trace]
fig = go.Figure(data = data,layout = layout)
py.iplot(fig)
def plot_pie(column) :
trace1 = go.Pie(values = churn[column].value_counts().values.tolist(),
labels = churn[column].value_counts().keys().tolist(),
hoverinfo = "label+percent+name",
domain = dict(x = [0,.48]),
name = "Churn Customers",
marker = dict(line = dict(width = 2,
color = "rgb(243,243,243)")
),
hole = .6
)
trace2 = go.Pie(values = not_churn[column].value_counts().values.tolist(),
labels = not_churn[column].value_counts().keys().tolist(),
hoverinfo = "label+percent+name",
marker = dict(line = dict(width = 2,
color = "rgb(243,243,243)")
),
domain = dict(x = [.52,1]),
hole = .6,
name = "Non churn customers"
)
layout = go.Layout(dict(title = column + " distribution in customer Churn ",
plot_bgcolor = "rgb(243,243,243)",
paper_bgcolor = "rgb(243,243,243)",
annotations = [dict(text = "churn customers",
font = dict(size = 13),
showarrow = False,
x = .15, y = .5),
dict(text = "Non churn customers",
font = dict(size = 13),
showarrow = False,
x = .88,y = .5
)
]
)
)
data = [trace1,trace2]
fig = go.Figure(data = data,layout = layout)
py.iplot(fig)
df1.columns
Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'IsActiveMember',
'EstimatedSalary', 'Churn', 'HasCrCard_No', 'HasCrCard_Yes',
'Tenure_group'],
dtype='object')
cat_cols = ['Geography', 'Gender', 'Tenure_group','NumOfProducts']
# for all categorical columns plot pie
for i in cat_cols :
plot_pie(i)
#function for histogram for customer attrition types
def histogram(column) :
trace1 = go.Histogram(x = churn[column],
histnorm= "percent",
name = "Churn Customers",
marker = dict(line = dict(width = .5,
color = "black"
)
),
opacity = .9
)
trace2 = go.Histogram(x = not_churn[column],
histnorm = "percent",
name = "Non churn customers",
marker = dict(line = dict(width = .5,
color = "black"
)
),
opacity = .9
)
data = [trace1,trace2]
layout = go.Layout(dict(title =column + " distribution in customer Churn ",
plot_bgcolor = "rgb(243,243,243)",
paper_bgcolor = "rgb(243,243,243)",
xaxis = dict(gridcolor = 'rgb(255, 255, 255)',
title = column,
zerolinewidth=1,
ticklen=5,
gridwidth=2
),
yaxis = dict(gridcolor = 'rgb(255, 255, 255)',
title = "percent",
zerolinewidth=1,
ticklen=5,
gridwidth=2
),
)
)
fig = go.Figure(data=data,layout=layout)
py.iplot(fig)
df1.columns
Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'IsActiveMember',
'EstimatedSalary', 'Churn', 'HasCrCard_No', 'HasCrCard_Yes',
'Tenure_group'],
dtype='object')
num_cols = ['Age','CreditScore','Tenure', 'Balance','EstimatedSalary']
#for all categorical columns plot histogram
for i in num_cols :
histogram(i)
plt.show()
# Save the cleaned file
df1.to_csv('clean_df.csv')